In [1]:
import numpy as np
import pandas as pd
import os
import re
import tensorflow as tf
from threading import Thread
import time
from tqdm import tqdm
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.offline import init_notebook_mode
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import Sequence
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D, Activation, Dropout, Flatten, Dense, Input, Layer
from tensorflow.keras.applications import VGG16, ResNet50, DenseNet201, Xception
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

init_notebook_mode(connected=True)

Product Recommendations: Visually Similar Content Filtering using KNNs

  • A recommender system, or a recommendation system, is a subclass of information filtering system that provide suggestions for items that are most pertinent to a particular user.Typically, the suggestions refer to various decision-making processes, such as what product to purchase, what music to listen to, or what online news to read. Recommender systems are particularly useful when an individual needs to choose an item from a potentially overwhelming number of items that a service may offer.
  • Recommender Systems can be broadly classified into 3 types
    • Collaborative Filtering
    • Content-Based Filtering
    • Hybrid
  • This notebook will demonstrate the Content-Based filtering, which are based on the description of an item and a profile of the user’s preferred choices. In a content-based recommendation system, features are used to describe the items, besides, a user profile is built to state the type of item this user likes. In other words, the algorithms try to recommend products that are similar to the ones that a user has liked in the past.
  • Although we do not have any kind of user profile data, we will use K-Nearest Neighbours algorithms to recommend products which have visually similar features, such as the ones you see in shopping websites e.g. "Products that are similar to this"

image.png

EDA and Visualization

  • Lets first merge the image data and product meta data to get the required dataset
  • We will look at some of the categories and subcategories to understand which categories are more dominant

Images Dataframe

In [2]:
images_df = pd.read_csv("../input/fashion-product-images-dataset/fashion-dataset/images.csv")

Product Meta Data Dataframe

In [3]:
styles_df = pd.read_csv("../input/fashion-product-images-dataset/fashion-dataset/styles.csv", on_bad_lines='skip')

Create Unique ID in both Dataframes

In [4]:
images_df['id'] = images_df['filename'].apply(lambda x: x.replace(".jpg","")).astype(int)
In [5]:
images_df
Out[5]:
filename link id
0 15970.jpg http://assets.myntassets.com/v1/images/style/p... 15970
1 39386.jpg http://assets.myntassets.com/v1/images/style/p... 39386
2 59263.jpg http://assets.myntassets.com/v1/images/style/p... 59263
3 21379.jpg http://assets.myntassets.com/v1/images/style/p... 21379
4 53759.jpg http://assets.myntassets.com/v1/images/style/p... 53759
... ... ... ...
44441 17036.jpg http://assets.myntassets.com/v1/images/style/p... 17036
44442 6461.jpg http://assets.myntassets.com/v1/images/style/p... 6461
44443 18842.jpg http://assets.myntassets.com/v1/images/style/p... 18842
44444 46694.jpg http://assets.myntassets.com/v1/images/style/p... 46694
44445 51623.jpg http://assets.myntassets.com/assets/images/516... 51623

44446 rows × 3 columns

Merging the Two Dataframes

In [6]:
data = styles_df.merge(images_df,on='id',how='left').reset_index(drop=True)
data['filename'] = data['filename'].apply(lambda x: os.path.join("../input/fashion-product-images-dataset/fashion-dataset/images/",x))
In [7]:
image_files = os.listdir("../input/fashion-product-images-dataset/fashion-dataset/images")

Removing Products for which images are not present

In [8]:
data['file_found'] = data['id'].apply(lambda x: f"{x}.jpg" in image_files)
In [9]:
data = data[data['file_found']].reset_index(drop=True)

Final Data

In [10]:
data.head()
Out[10]:
id gender masterCategory subCategory articleType baseColour season year usage productDisplayName filename link file_found
0 15970 Men Apparel Topwear Shirts Navy Blue Fall 2011.0 Casual Turtle Check Men Navy Blue Shirt ../input/fashion-product-images-dataset/fashio... http://assets.myntassets.com/v1/images/style/p... True
1 39386 Men Apparel Bottomwear Jeans Blue Summer 2012.0 Casual Peter England Men Party Blue Jeans ../input/fashion-product-images-dataset/fashio... http://assets.myntassets.com/v1/images/style/p... True
2 59263 Women Accessories Watches Watches Silver Winter 2016.0 Casual Titan Women Silver Watch ../input/fashion-product-images-dataset/fashio... http://assets.myntassets.com/v1/images/style/p... True
3 21379 Men Apparel Bottomwear Track Pants Black Fall 2011.0 Casual Manchester United Men Solid Black Track Pants ../input/fashion-product-images-dataset/fashio... http://assets.myntassets.com/v1/images/style/p... True
4 53759 Men Apparel Topwear Tshirts Grey Summer 2012.0 Casual Puma Men Grey T-shirt ../input/fashion-product-images-dataset/fashio... http://assets.myntassets.com/v1/images/style/p... True

Checking for Null Values

In [11]:
data.isnull().sum()
Out[11]:
id                      0
gender                  0
masterCategory          0
subCategory             0
articleType             0
baseColour             15
season                 21
year                    1
usage                 317
productDisplayName      7
filename                0
link                    0
file_found              0
dtype: int64

Visualizations

  • Main Categories Count
  • Sub Categories Count
  • Products by Season Count
  • Product Usage type Count
In [12]:
fig = px.bar(data.groupby('masterCategory').count().reset_index(), x='masterCategory',y='id',title='Count per Product Category')
fig.update_layout(barmode='stack', xaxis={'categoryorder':'total descending'})
In [13]:
fig = px.bar(data.groupby('subCategory').count().reset_index(), x='subCategory',y='id',title='Count per Product Sub-category', color='subCategory')
fig.update_layout(barmode='stack', xaxis={'categoryorder':'total descending'})
In [14]:
fig = px.bar(data.groupby('season').count().reset_index(), x='season', y='id', title='Count per Season Category')
fig.update_layout(barmode='stack', xaxis={'categoryorder':'total descending'})
In [15]:
fig = px.bar(data.groupby('usage').count().reset_index(), x='usage', y='id', title='Count per Usage Category')
fig.update_layout(barmode='stack', xaxis={'categoryorder':'total descending'})

Remove Unneccessary Columns

  • As for now we will delete the product display name, but in the future versions of the notebook we will also use the text features
In [16]:
data.drop(columns=['productDisplayName','link','file_found'],inplace=True)
data
Out[16]:
id gender masterCategory subCategory articleType baseColour season year usage filename
0 15970 Men Apparel Topwear Shirts Navy Blue Fall 2011.0 Casual ../input/fashion-product-images-dataset/fashio...
1 39386 Men Apparel Bottomwear Jeans Blue Summer 2012.0 Casual ../input/fashion-product-images-dataset/fashio...
2 59263 Women Accessories Watches Watches Silver Winter 2016.0 Casual ../input/fashion-product-images-dataset/fashio...
3 21379 Men Apparel Bottomwear Track Pants Black Fall 2011.0 Casual ../input/fashion-product-images-dataset/fashio...
4 53759 Men Apparel Topwear Tshirts Grey Summer 2012.0 Casual ../input/fashion-product-images-dataset/fashio...
... ... ... ... ... ... ... ... ... ... ...
44414 17036 Men Footwear Shoes Casual Shoes White Summer 2013.0 Casual ../input/fashion-product-images-dataset/fashio...
44415 6461 Men Footwear Flip Flops Flip Flops Red Summer 2011.0 Casual ../input/fashion-product-images-dataset/fashio...
44416 18842 Men Apparel Topwear Tshirts Blue Fall 2011.0 Casual ../input/fashion-product-images-dataset/fashio...
44417 46694 Women Personal Care Fragrance Perfume and Body Mist Blue Spring 2017.0 Casual ../input/fashion-product-images-dataset/fashio...
44418 51623 Women Accessories Watches Watches Pink Winter 2016.0 Casual ../input/fashion-product-images-dataset/fashio...

44419 rows × 10 columns

Train-Val Split

  • Although we won't be performing a traditional training validation approach, we still want to perform some of the steps on the training set and use those learned weights on the validation set
  • Shuffle the data randomly
  • 80% data for train
  • 20% data for validation
In [17]:
data = data.sample(frac=1).reset_index(drop=True)
n = len(data)
train = data.iloc[:int(n*0.8),:]
val = data.iloc[int(n*0.8):,:].reset_index(drop=True)

Data Generator

In [19]:
datagen = ImageDataGenerator(rescale=1/255.)

train_generator = datagen.flow_from_dataframe(dataframe=train,
                                             target_size=(256,256),
                                             x_col='filename',
                                             class_mode=None,
                                             batch_size=32,
                                             shuffle=False,
                                             classes=['images'])

val_generator = datagen.flow_from_dataframe(dataframe=val,
                                             target_size=(256,256),
                                             x_col='filename',
                                             class_mode=None,
                                             batch_size=32,
                                             shuffle=False,
                                             classes=['images'])
Found 35535 validated image filenames.
Found 8884 validated image filenames.

Feature Extraction: Pre-trained VGG16

  • We will extract the features of the image using the pre-trained deep neural VGG16 network
  • We will extract the final features of the network by using global average pooling on the final convolutional block of the network
In [21]:
base_model = VGG16(include_top=False,input_shape=(256,256,3))

model = Sequential()
for layer in base_model.layers:
    model.add(layer)
model.add(GlobalAveragePooling2D())
model.summary()
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
block1_conv1 (Conv2D)        (None, 256, 256, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 256, 256, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 128, 128, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 128, 128, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 128, 128, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 64, 64, 128)       0         
_________________________________________________________________
block3_conv1 (Conv2D)        (None, 64, 64, 256)       295168    
_________________________________________________________________
block3_conv2 (Conv2D)        (None, 64, 64, 256)       590080    
_________________________________________________________________
block3_conv3 (Conv2D)        (None, 64, 64, 256)       590080    
_________________________________________________________________
block3_pool (MaxPooling2D)   (None, 32, 32, 256)       0         
_________________________________________________________________
block4_conv1 (Conv2D)        (None, 32, 32, 512)       1180160   
_________________________________________________________________
block4_conv2 (Conv2D)        (None, 32, 32, 512)       2359808   
_________________________________________________________________
block4_conv3 (Conv2D)        (None, 32, 32, 512)       2359808   
_________________________________________________________________
block4_pool (MaxPooling2D)   (None, 16, 16, 512)       0         
_________________________________________________________________
block5_conv1 (Conv2D)        (None, 16, 16, 512)       2359808   
_________________________________________________________________
block5_conv2 (Conv2D)        (None, 16, 16, 512)       2359808   
_________________________________________________________________
block5_conv3 (Conv2D)        (None, 16, 16, 512)       2359808   
_________________________________________________________________
block5_pool (MaxPooling2D)   (None, 8, 8, 512)         0         
_________________________________________________________________
global_average_pooling2d_1 ( (None, 512)               0         
=================================================================
Total params: 14,714,688
Trainable params: 14,714,688
Non-trainable params: 0
_________________________________________________________________

Extracting Features of Training and Validation Set

In [22]:
train_features = model.predict(train_generator,verbose=1)
val_features = model.predict(val_generator,verbose=1)
2022-07-07 18:11:05.491363: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2022-07-07 18:11:07.666070: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005
1111/1111 [==============================] - 1584s 1s/step
278/278 [==============================] - 403s 1s/step

Dimensionality Reduction

  • The features extracted using the VGG16 Network product a vector of 512 features. So if we have lets say 10000 products the matrix of our data will be of the dimension 10000 x 512
  • Lets see if we can reduce the dimension of this matrix
  • For a linear projection based dimensionality reduction method to work there should be high correlation amongst the features
  • But to visualize a correlation coeff table of such a high dimensional matrix will be highly resource utilizing
  • Lets perform the PCA and see the variance explaination of the principal components to check if our linear projection approach works
In [23]:
from sklearn.decomposition import PCA

Illustration of how PCA finds the axes where the within data variability will be maximum

  • Lets proceed with 2 principal components first so that we can visualize them in 2D
  • We will then look at the Cumulative Variance Explanation to find the right amount of principal components

In [24]:
pca = PCA(2)
pca.fit(train_features)
train_pca = pca.transform(train_features)
In [25]:
test_pca = pca.fit_transform(val_features)
In [26]:
train_pca = pd.DataFrame(train_pca)
train = train.iloc[:,0:10]
train = train.merge(train_pca, how='left', left_index=True, right_index=True)

Visualization: Principal Components

In [79]:
fig = px.scatter(train, x=0, y=1, color="masterCategory", title='Main Category', height=600, labels={
                     "0": "Principal Component 1",
                     "1": "Principal Component 2"})
fig.show()
In [80]:
fig = px.scatter(train, x=0, y=1, color="gender", title='Gender', height=600, labels={
                     "0": "Principal Component 1",
                     "1": "Principal Component 2"})
fig.show()
In [81]:
fig = px.scatter(train, x=0, y=1, color="subCategory", title='Sub Category', height=600, labels={
                     "0": "Principal Component 1",
                     "1": "Principal Component 2"})
fig.show()
In [82]:
fig = px.scatter(train[train['season'].notna()], x=0, y=1, color="season", title='Season', height=600, labels={
                     "0": "Principal Component 1",
                     "1": "Principal Component 2"})
fig.show()
In [83]:
fig = px.scatter(train[train['usage'].notna()], x=0, y=1, color="usage", title='Usage', height=600, labels={
                     "0": "Principal Component 1",
                     "1": "Principal Component 2"})
fig.show()

Inference:

  • The 2 principal components show a reasonably good separability in terms of main product categories
  • Maybe after taking more principal components the seprability will be more evident
In [56]:
pca = PCA()
pca.fit(train_features)
train_pca = pca.transform(train_features)
variance_explained = np.cumsum(pca.explained_variance_ratio_)
pcs = range(1,len(variance_explained)+1)

Reduced Dimensions: 512 -> 313

  • First 313 Principal Components explain 99% of the variance in data
  • We will reduce the image feature dimensions to 313
In [72]:
px.line(x = pcs, y = variance_explained, title = 'Principal Components Cumulative Explained Variance', height=600,  labels={
                     "x": "Principal Components",
                     "y": "Explained Variance"})
In [61]:
val_pca = pca.fit_transform(val_features)[:,:313]
val_pca = pd.DataFrame(val_pca)
val = val.iloc[:,0:10]
val = val.merge(val_pca, how='left', left_index=True, right_index=True)
In [63]:
X = val.iloc[:,-313:]
y = val['id']

K-Nearest Neighbours

  • Lets use k=6, to recommend 6 most similar looking products
  • We choose 6 because the first product out of the 6 similar looking products will be the query product itself
  • Therefore we can now look at the 5 most similar products based on raw extracted features from the pre-trained network
  • We will not use the predict method for classification prediction, but the KNeighbours method to find the k or 6 most nearest neighbours which have visually similar content
  • The evaluation of such recommendation techniques is usually done using something like hit-rate, which is an indicator of if the user bought the product recommended to them

image.png

In [64]:
from sklearn.neighbors import KNeighborsClassifier

neigh = KNeighborsClassifier(n_neighbors=6)
neigh.fit(X, y)
Out[64]:
KNeighborsClassifier(n_neighbors=6)
In [65]:
def read_img(image_path):
    image = load_img(image_path,target_size=(256,256,3))
    image = img_to_array(image)
    image = image/255.
    return image

Results

  • The results can definitely improved by creating data of similar styled or branded products
  • A siamese network can then be trained to create image features where similarly styled or branded products are in closer proximity
In [66]:
import random
In [69]:
for _ in range(10):
    i = random.randint(1,len(val))
    img1 = read_img(val.loc[i,'filename'])
    dist, index = neigh.kneighbors(X=X.iloc[i,:].values.reshape(1,-1))
    plt.figure(figsize = (4 , 4))
    plt.imshow(img1)
    plt.title("Input Image")

    plt.figure(figsize = (20 , 20))
    for i in range(1,6):
        plt.subplot(1 , 5, i)
        plt.subplots_adjust(hspace = 0.5 , wspace = 0.3)
        image = read_img(val.loc[index[0][i],'filename'])
        plt.imshow(image)
        plt.title(f'Similar Product #{i}')